In this scenario the number of peaks in a generated dataset is varied from 2 to 5, while all other parameters are kept constant (noise level = 1%). The number of peaks expected by the probabilistic model is varied over the same range, so every model configuration is run against every spectrum.
The model used to infer the parameters is formulated as follows:
\begin{equation} \large y = f(x) = \sum\limits_{m=1}^M \big[A_m \cdot e^{-\frac{(x-\mu_m)^2}{2\cdot\sigma_m^2}}\big] + \epsilon \end{equation}
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import arviz as az
#az.style.use('arviz-darkgrid')
print('Running on PyMC3 v{}'.format(pm.__version__))
import os
import sys
sys.path.append('../../modules')
import datagen as dg
import models as mdl
import results as res
import figures as fig
import settings as cnf
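The probabilistic model itself is built by mdl.model_gauss from modules/models.py, which is not shown here. As a minimal sketch, assuming lognormal priors on the amplitudes and widths (matching the 'lognormal' prior model configured below), the equation above maps onto PyMC3 roughly as follows. All names, prior parameters, and shapes in this sketch are illustrative assumptions, not the actual implementation:
def sketch_model_gauss(xvalues, observations, npeaks, mu_peaks=None):
    # hypothetical stand-in for mdl.model_gauss; priors are placeholders
    # observations: a spectrum broadcastable against the model curve
    with pm.Model() as model:
        amp = pm.Lognormal('amp', mu=0.0, sigma=1.0, shape=npeaks)
        sigma = pm.Lognormal('sigma', mu=1.0, sigma=1.0, shape=npeaks)
        # peak positions; optional test values help initialize the sampler
        mu = pm.Normal('mu', mu=300.0, sigma=50.0, shape=npeaks,
                       testval=mu_peaks)
        epsilon = pm.HalfNormal('epsilon', sigma=1.0)
        # superposition of npeaks Gaussians on the wavelength grid
        comps = amp * pm.math.exp(-(xvalues[:, None] - mu)**2 / (2 * sigma**2))
        pm.Normal('y', mu=comps.sum(axis=1), sigma=epsilon,
                  observed=observations)
    return model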
# output for results and images
out_path = './output_adapt_diag'
file_basename = out_path + '/scenario_peaks'
# if dir does not exist, create it
if not os.path.exists(out_path):
    os.makedirs(out_path)
conf = {}
# scenario name
conf['scenario'] = 'peak variation'
# initialization method for sampler
conf['init_mode'] = 'adapt_diag'
# probabilistic model (priors)
conf['prior_model'] = 'lognormal'
# provide peak positions to the model as testvalues ('yes'/'no')
conf['peak_info'] = 'yes'
# model mode ('train'/'eval')
conf['model_mode'] = 'train'
# data mode ('generate'/'preload')
conf['data_mode'] = 'generate'
# dataset directory (needed for 'preload' data mode)
#conf['dataset_dir'] = './input_datasets'
# number of cores to run sampling chains on
conf['ncores'] = 2
# number of samples per chain
conf['nsamples'] = 2000
conf
cnf.save(out_path, conf)
# list of wavelengths (x-values)
xval = [i for i in range(200, 400, 2)]
ldata = []
lpeaks = []
lpeakdata = []
# number of spectra per peak number
nsets = 5
# number of peaks in the spectrum
peak_numbers = [2,3,4,5]
# total number of datasets
tsets = nsets * len(peak_numbers)
if conf['model_mode'] == 'train' and conf['data_mode'] == 'generate':
    # generate the datasets
    for pn in peak_numbers:
        for i in range(nsets):
            df, peaks, df_peakinfo = dg.data_generator(xvalues=xval, nsamples=15, npeaks=pn)
            ldata.append(df)
            lpeaks.append(peaks)
            lpeakdata.append(df_peakinfo)
    # save data and peak information to disk
    for i in range(len(ldata)):
        ldata[i].to_csv(out_path + '/dataset_%02d.csv' % (i+1), index=False)
        lpeakdata[i].to_csv(out_path + '/peakdata_%02d.csv' % (i+1), index=False)
    dg.data_save(out_path + '/peakinfo.csv', lpeaks)
elif conf['model_mode'] == 'train' and conf['data_mode'] == 'preload':
    # load pre-generated datasets from disk
    ldata, lpeaks, lpeakdata = dg.data_load(tsets, conf['dataset_dir'])
else:
    # load data from disk
    if conf['data_mode'] == 'preload':
        ldata, lpeaks, lpeakdata = dg.data_load(tsets, conf['dataset_dir'])
    else:
        ldata, lpeaks, lpeakdata = dg.data_load(tsets, out_path)
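The generator dg.data_generator lives in modules/datagen.py and is not shown here. As a rough, hypothetical illustration of what it produces, a single noisy spectrum following the equation above can be synthesized like this; the function name and all parameter values are invented for illustration only:
def sketch_spectrum(xvalues, amps, mus, sigmas, noise):
    # sum of Gaussian peaks per the model equation, plus i.i.d. Gaussian noise
    x = np.asarray(xvalues, dtype=float)
    clean = sum(a * np.exp(-(x - m)**2 / (2 * s**2))
                for a, m, s in zip(amps, mus, sigmas))
    return clean + noise * np.random.randn(x.size)

# e.g. a 3-peak spectrum with 1% noise on the same wavelength grid
spec = sketch_spectrum(xval, amps=[1.0, 0.6, 0.8], mus=[240, 300, 360],
                       sigmas=[6.0, 8.0, 5.0], noise=0.01)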
print("total number of peak numbers : {0}".format(len(peak_numbers)))
print("total number of spectra per peak number : {0}".format(nsets))
print("total number of datasets per model : {0}".format(tsets))
print("total number of inference runs : {0}".format(nsets*len(peak_numbers)**2))
# plot datasets
fig.plot_datasets(ldata, lpeaks, dims=(int(tsets/2), 2), figure_size=(12, int(tsets*1.8)),
                  savefig='yes', fname=file_basename)
# convert pandas data to numpy arrays
x_val = np.array(xval, dtype='float32')
# store dataset y-values in list
cols = ldata[0].columns
y_val = [ldata[i][cols].values for i in range(len(ldata))]
# initialize models and run inference
models = []
traces = []
lmodpeak = []
# current run number
run = 1
# total number of inference runs
truns = nsets * len(peak_numbers)**2
for pn in peak_numbers:
    if conf['model_mode'] == 'train':
        # for each model (from low to high peak number) run inference on all spectra
        print("running {0}-peak model".format(pn))
    for i in range(len(ldata)):
        # actual peak positions in this spectrum (sorted; also used for logging below)
        plist = sorted(lpeaks[i])
        if conf['peak_info'] == 'yes':
            # Pass peak positions to the model as test values. If the spectrum contains
            # fewer peaks than the model expects, expand the list to the expected size by
            # cyclically duplicating the existing mu values; otherwise truncate the list,
            # keeping the peaks with the highest amplitudes.
            if len(plist) < pn:
                pl = sorted(np.resize(plist, (1, pn)).flatten())
            else:
                # sort peak info dataframe on amplitude value
                l1 = lpeakdata[i].sort_values('amp', ascending=False)
                # truncate list to expected peak number
                pl = l1['mu'].values[:pn]
            model_g = mdl.model_gauss(xvalues=x_val, observations=y_val[i], npeaks=pn,
                                      mu_peaks=pl, pmodel=conf['prior_model'])
        else:
            model_g = mdl.model_gauss(xvalues=x_val, observations=y_val[i], npeaks=pn,
                                      pmodel=conf['prior_model'])
        models.append(model_g)
        with model_g:
            if conf['model_mode'] == 'train':
                print("({2}/{3}) running inference on dataset #{0}/{1} [{4}-peak model:{5}-peak spectrum]"
                      .format(i+1, len(ldata), run, truns, pn, len(plist)))
                lmodpeak += [(pn, len(plist))]
                trace_g = pm.sample(conf['nsamples'], init=conf['init_mode'], cores=conf['ncores'])
                traces.append(trace_g)
                # save inference results
                pm.backends.text.dump(out_path + '/traces_%02d' % (run), trace_g)
            else:
                # load traces from disk
                print("loading dataset #{0}/{1}".format(run, truns))
                trace_g = pm.backends.text.load(out_path + '/traces_%02d' % (run))
                traces.append(trace_g)
        run += 1
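Note that the padding step above relies on np.resize repeating its input cyclically when the target shape is larger than the source, for example:
# np.resize pads by repeating elements cyclically, e.g. feeding a
# 2-peak spectrum to a 3-peak model:
np.resize([250.0, 310.0], (1, 3)).flatten()   # -> array([250., 310., 250.])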
pm.model_to_graphviz(models[0])
# save model figure as image
img = pm.model_to_graphviz(models[0])
img.render(filename=file_basename + '_model', format='png');
# posterior predictive traces
ppc = [pm.sample_posterior_predictive(traces[i], samples=500, model=models[i]) for i in range(len(traces))]
# various plots to inspect the inference results
varnames = ['amp', 'mu', 'sigma', 'epsilon']
#az.plot_trace(traces[2], varnames, compact=True);
#az.plot_trace(traces[2], varnames, divergences='top');
#az.plot_autocorr(traces[0], varnames);
#az.plot_posterior(traces[2], varnames);
#for idx, trace in enumerate(traces):
# az.plot_forest(trace, var_names = varnames, r_hat=True, ess=True);
if conf['model_mode'] == 'train':
    # collect the results and display
    df = res.get_results_summary(varnames, traces, ppc, y_val, epsilon_real=0.05, sets=tsets, labels=lmodpeak)
else:
    # load results from disk
    df = pd.read_csv(file_basename + '.csv')
    df.index += 1
#df.sort_values(by=['r2'])
df
if conf['model_mode'] == 'train':
    # save results to .csv
    df.to_csv(file_basename + '.csv', index=False)
fig.plot_posterior(x_val, ldata, traces, ppc, dims=(int(truns/2), 2), figure_size=(12, int(truns*1.8)),
                   savefig='yes', fname=file_basename, showpeaks='no', sets=tsets)
fig.plot_posterior(x_val, ldata, traces, ppc, dims=(int(truns/2), 2), figure_size=(12, int(truns*1.8)),
                   savefig='yes', fname=file_basename, showpeaks='yes', sets=tsets)
cnf.close(out_path)